Statistik 2
Hochschule Aalen, Sommersemester 2021
Martin Heckmann
martin.heckmann@hs-aalen.de
19.05.2021
The goal of this assignment is to understand the principles of parameter estimation.
Data set: americans.csv. Plot the results with graph_objects.Histogram. Overlay the histograms for the means, medians and modes in one histogram using opacity=0.75 as follows:
fig = go.Figure()
fig.add_trace(go.Histogram(x= ...
fig.add_trace(go.Histogram(x= ...
...
fig.update_traces(opacity=0.75, ...
fig.update_layout(
title='Height'
)
fig.show()
`
from sklearn.linear_model import LinearRegression
from sklearn import metrics
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
print(lin_reg.intercept_, lin_reg.coef_)
y_predict = lin_reg.predict(X_test)
MSE = metrics.mean_squared_error(y_test, y_predict)
bin_width = 1
fig = plotly.graph_objects.Figure()
fig.add_trace(plotly.graph_objects.Histogram(x=data_frame[data_frame['size']==size1], name=size1, xbins=dict(size=bin_width)))
fig.add_trace(plotly.graph_objects.Histogram(x=data_frame[data_frame['size']==size2], name=size2, xbins=dict(size=bin_width)))
fig.add_scatter(x=[population_value,population_value],y=[0, hist_height],name="population value")
fig.update_traces(opacity=0.75)
fig.update_layout(
title="MSE",
barmode='overlay'
)
fig.show()
#Task1
#As previously, randomly extract a sample of 𝑁 data points from the data set americans.csv
import pandas
import numpy
import plotly.graph_objects
import plotly.express
# Seed NumPy's global random generator so that the random draws further down
# are repeatable (pandas' DataFrame.sample falls back to this generator when
# no random_state is passed -- see the pandas documentation).
numpy.random.seed(31)
# Load the survey data; the file is looked up relative to the working directory.
americans=pandas.read_csv("americans.csv",delimiter=",")
# Bare expression: renders the table when this is the last line of a notebook
# cell; it has no effect when the file is run as a plain script.
americans
| Unnamed: 0 | CaseID | Age | Sex | BMI | Height (in) | Weight (lb) | Height (cm) | Weight (kg) | DoesGroceries | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 20160101160045 | 62 | 2 | 26.6 | 66 | 165 | 167.64 | 74.910 | True |
| 1 | 1 | 20160101160066 | 69 | 1 | 44.3 | 69 | 300 | 175.26 | 136.200 | True |
| 2 | 2 | 20160101160069 | 24 | 2 | 24.5 | 64 | 143 | 162.56 | 64.922 | True |
| 3 | 3 | 20160101160083 | 31 | 2 | 21.2 | 57 | 98 | 144.78 | 44.492 | True |
| 4 | 4 | 20160101160084 | 59 | 2 | 29.7 | 64 | 173 | 162.56 | 78.542 | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9693 | 10205 | 20161212162307 | 52 | 2 | 33.6 | 65 | 202 | 165.10 | 91.708 | False |
| 9694 | 10206 | 20161212162357 | 69 | 1 | 26.5 | 68 | 174 | 172.72 | 78.996 | True |
| 9695 | 10207 | 20161212162426 | 28 | 1 | 27.1 | 64 | 158 | 162.56 | 71.732 | True |
| 9696 | 10208 | 20161212162456 | 80 | 2 | 44.9 | 65 | 270 | 165.10 | 122.580 | False |
| 9697 | 10209 | 20161212162509 | 16 | 2 | 22.8 | 68 | 150 | 172.72 | 68.100 | False |
9698 rows × 10 columns
# Task 1: for every sample size in N draw 1000 random samples from the data
# set and record, per sample, the mean / median / mode / standard deviation of
# height and weight plus the share of rows with Sex == 1 ("men_ratio").
# dataframes[k] holds the 1000 result rows that belong to sample size N[k].
dataframes = []
N = [10, 50, 100, 500, 1000]
for sample_size in N:
    records = []
    for _ in range(1000):
        sample = americans.sample(n=sample_size)
        height = sample['Height (cm)']
        weight = sample['Weight (kg)']
        records.append({
            'mean_h': height.mean(),
            'mean_w': weight.mean(),
            'median_h': height.median(),
            # Fixed: this column used to be filled with the weight *means*.
            'median_w': weight.median(),
            # mode() can return several values; the first one is used.
            'mode_h': height.mode()[0],
            'mode_w': weight.mode()[0],
            'std_h': height.std(),
            'std_w': weight.std(),
            # get(1, 0) yields a ratio of 0.0 when the sample contains no row
            # with Sex == 1, without needing a catch-all except clause.
            'men_ratio': sample['Sex'].value_counts().get(1, 0) / sample_size,
        })
    dataframes.append(pandas.DataFrame(
        records,
        columns=['mean_h', 'mean_w', 'median_h', 'median_w', 'mode_h',
                 'mode_w', 'std_h', 'std_w', 'men_ratio'],
    ))
#dataframes[4]
# Reference values computed on the complete data set (prefix "F_" = full data).
_full_height = americans['Height (cm)']
_full_weight = americans['Weight (kg)']

F_mean_h, F_median_h, F_mode_h, F_std_h = (
    _full_height.mean(),
    _full_height.median(),
    _full_height.mode()[0],  # first value if several modes exist
    _full_height.std(),
)
F_mean_w, F_median_w, F_mode_w, F_std_w = (
    _full_weight.mean(),
    _full_weight.median(),
    _full_weight.mode()[0],
    _full_weight.std(),
)
def _plot_estimator_histograms(stats, suffix, bin_width, population, title):
    """Show the overlaid estimator histograms for one variable.

    Args:
        stats: DataFrame with the columns ``mean_<suffix>``,
            ``median_<suffix>``, ``mode_<suffix>`` and ``std_<suffix>``,
            one row per drawn sample.
        suffix: Column suffix of the variable to plot ('h' or 'w').
        bin_width: Bin size used for the mean / median / mode histograms.
        population: Dict with the keys 'mean', 'median', 'mode' and 'std'
            holding the values computed on the full data set.
        title: Figure title.
    """
    graph = plotly.graph_objects
    colors = {'mean': 'blue', 'median': 'orange', 'mode': 'green'}

    fig = graph.Figure()
    for estimator, color in colors.items():
        fig.add_trace(graph.Histogram(
            marker=dict(color=color),
            x=stats[estimator + '_' + suffix],
            name=estimator,
            xbins=dict(size=bin_width),
        ))
    # The std estimates live on a different scale, so they get their own bin
    # width (range / 21) and are hidden until enabled in the legend.
    std_estimates = stats['std_' + suffix]
    fig.add_trace(graph.Histogram(
        marker=dict(color='purple'),
        x=std_estimates,
        name='std',
        xbins=dict(size=(std_estimates.max() - std_estimates.min()) / 21),
        visible='legendonly',
    ))

    # Centre of each sampling distribution: mean of the means, median of the
    # medians, first mode of the modes.
    centres = {
        'mean': stats['mean_' + suffix].mean(),
        'median': stats['median_' + suffix].median(),
        'mode': stats['mode_' + suffix].mode()[0],
    }
    # Vertical offsets keep the annotation labels from overlapping.
    shifts = {'mean': 0, 'median': -80, 'mode': -160}
    for prefix, values, extra_shift in (('sample', centres, 0),
                                        ('full', population, -20)):
        for estimator, color in colors.items():
            label = prefix + '_' + estimator
            annotation = dict(text=label, showarrow=True)
            shift = shifts[estimator] + extra_shift
            if shift:
                annotation['yshift'] = shift
            # Each line is named after its own label (previously every line
            # carried the copy-pasted name "sample_mean").
            fig.add_vline(line=dict(color=color), x=values[estimator],
                          name=label, annotation=annotation)

    # Reference lines for the standard deviation. The sample line sits at the
    # mean of the std estimates (it used to be the std of the std estimates,
    # which cannot be compared with the full-data std).
    sample_std = std_estimates.mean()
    fig.add_scatter(marker=dict(color='purple'), x=[sample_std, sample_std],
                    y=[0, 270], name="sample_std", visible='legendonly')
    fig.add_scatter(marker=dict(color='purple'),
                    x=[population['std'], population['std']],
                    y=[0, 270], name="full_std", visible='legendonly')

    fig.update_traces(opacity=0.5)
    fig.update_layout(title=title, barmode='overlay')
    fig.show()


# One height figure and one weight figure per sample size.
for sample_size, stats in zip(N, dataframes):
    # Height: 21 bins across the joint range of the mean/median/mode estimates.
    height_estimates = stats[['mean_h', 'median_h', 'mode_h']]
    height_bin_width = (height_estimates.max().max()
                        - height_estimates.min().min()) / 21
    _plot_estimator_histograms(
        stats, 'h', height_bin_width,
        {'mean': F_mean_h, 'median': F_median_h,
         'mode': F_mode_h, 'std': F_std_h},
        str(sample_size) + " Height (cm)",
    )
    # Weight: fixed bin width of 1.
    _plot_estimator_histograms(
        stats, 'w', 1,
        {'mean': F_mean_w, 'median': F_median_w,
         'mode': F_mode_w, 'std': F_std_w},
        str(sample_size) + " Weight (kg)",
    )
#Task1 fin
# Task 2: distribution of the share of men across the samples stored in
# dataframes[2], with the share in the full data set as a vertical line.
df_n100 = dataframes[2]
population_men_ratio = americans['Sex'].value_counts().get(1) / len(americans)

fig = plotly.graph_objects.Figure()
men_ratio_histogram = plotly.graph_objects.Histogram(x=df_n100['men_ratio'])
fig.add_trace(men_ratio_histogram)
fig.add_vline(x=population_men_ratio)
fig.show()
def _sample_statistics(population, sample_size=100, repetitions=1000):
    """Repeatedly sample ``population`` and collect height/weight estimates.

    Args:
        population: DataFrame with the columns 'Height (cm)' and 'Weight (kg)'.
        sample_size: Number of rows drawn per sample.
        repetitions: Number of samples to draw.

    Returns:
        DataFrame with one row per sample and the columns mean_h, mean_w,
        median_h, median_w, mode_h, mode_w, std_h and std_w.
    """
    records = []
    for _ in range(repetitions):
        sample = population.sample(n=sample_size)
        height = sample['Height (cm)']
        weight = sample['Weight (kg)']
        records.append({
            'mean_h': height.mean(),
            'mean_w': weight.mean(),
            'median_h': height.median(),
            # Fixed: this column used to be filled with the weight *means*.
            'median_w': weight.median(),
            # mode() can return several values; the first one is used.
            'mode_h': height.mode()[0],
            'mode_w': weight.mode()[0],
            'std_h': height.std(),
            'std_w': weight.std(),
        })
    return pandas.DataFrame(
        records,
        columns=['mean_h', 'mean_w', 'median_h', 'median_w',
                 'mode_h', 'mode_w', 'std_h', 'std_w'],
    )


# Split the data by the Sex code (1 -> "men" subset, 2 -> "women" subset) and
# draw 1000 samples of 100 rows from each subset.
americans_men = americans[americans['Sex'] == 1]
americans_women = americans[americans['Sex'] == 2]
df_m = _sample_statistics(americans_men)
df_w = _sample_statistics(americans_women)
# One overlay figure per estimate, comparing the mixed samples (df_n100) with
# the men-only (df_m) and women-only (df_w) samples.
# Entries: (column, figure title, fixed bin size or None for automatic bins,
# trace opacity).
comparison_plots = (
    ('mean_h', "Height mean compared", 1, .75),
    ('mean_w', "Weight mean compared", 1, .75),
    ('median_h', "Height median compared", 1, .75),
    ('median_w', "Weight median compared", 1, .75),
    ('mode_h', "Height mode compared", 1, .5),
    ('mode_w', "Weight mode compared", 1, .5),
    ('std_h', "Height std compared", None, .75),
    ('std_w', "Weight std compared", None, .75),
)
compared_groups = (
    ('men & women', df_n100),
    ('men', df_m),
    ('women', df_w),
)

for column, plot_title, bin_size, trace_opacity in comparison_plots:
    fig = plotly.graph_objects.Figure()
    for group_label, group_stats in compared_groups:
        histogram_options = dict(x=group_stats[column], name=group_label)
        if bin_size is not None:
            histogram_options['xbins'] = dict(size=bin_size)
        fig.add_trace(plotly.graph_objects.Histogram(**histogram_options))
    fig.update_traces(opacity=trace_opacity)
    fig.update_layout(title=plot_title, barmode='overlay')
    fig.show()
#Task2 fin
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
# Task 3: for each sample size draw 1000 samples, fit weight ~ height on a
# 70 % training split and store intercept (w0), slope (w1) and the mean
# squared error on the remaining 30 % in the columns "<n>_w0", "<n>_w1" and
# "<n>_mse" of erg.
erg = pandas.DataFrame()
N = [50, 500]
for sample_size in N:
    intercepts, slopes, test_errors = [], [], []
    for _ in range(1000):
        sample = americans.sample(n=sample_size)
        x_train, x_test, y_train, y_test = train_test_split(
            sample['Height (cm)'],
            sample['Weight (kg)'],
            test_size=.3,
            random_state=0,
        )
        # scikit-learn expects a 2-D feature matrix, hence reshape(-1, 1)
        # for the single feature column.
        train_features = x_train.values.reshape(-1, 1)
        test_features = x_test.values.reshape(-1, 1)

        lin_reg = LinearRegression()
        lin_reg.fit(train_features, y_train)
        intercepts.append(lin_reg.intercept_)
        slopes.append(lin_reg.coef_[0])

        y_predict = lin_reg.predict(test_features)
        test_errors.append(metrics.mean_squared_error(y_test, y_predict))

    column_prefix = str(sample_size)
    erg[column_prefix + '_w0'] = intercepts
    erg[column_prefix + '_w1'] = slopes
    erg[column_prefix + '_mse'] = test_errors
#print(''+str(N[i])+'_mse')
# Reference model on the whole population: 70 % training, 30 % test.
# Fixed: the split used to be taken from `sample` (the last random sample
# drawn above) instead of the full `americans` data set, so the "population"
# coefficients and MSE were really those of a single sample.
x_train, x_test, y_train, y_test = train_test_split(
    americans['Height (cm)'],
    americans['Weight (kg)'],
    test_size=.3,
    random_state=0,
)
pop_lin_reg = LinearRegression()
# scikit-learn expects a 2-D feature matrix for the single feature column.
x_train = x_train.values.reshape(-1, 1)
pop_lin_reg.fit(x_train, y_train)
y_predict = pop_lin_reg.predict(x_test.values.reshape(-1, 1))
pop_mse = metrics.mean_squared_error(y_test, y_predict)
# One figure per sample size: histograms of the fitted intercepts (w0) and
# slopes (w1), with vertical lines for their means, their standard deviations
# (hidden until enabled in the legend) and the coefficients of pop_lin_reg.
for sample_size in N:
    column_prefix = str(sample_size)
    intercepts = erg[column_prefix + '_w0']
    slopes = erg[column_prefix + '_w1']

    fig = plotly.graph_objects.Figure()
    fig.add_trace(plotly.graph_objects.Histogram(x=intercepts, name='w0'))
    fig.add_trace(plotly.graph_objects.Histogram(x=slopes, name='w1'))

    # (x position, legend name, extra trace options) for every vertical line.
    reference_lines = (
        (intercepts.mean(), 'w0 mean', dict(marker=dict(color='#636EFA'))),
        (slopes.mean(), 'w1 mean', dict(marker=dict(color='#EF553B'))),
        (intercepts.std(), 'w0 std', dict(visible='legendonly')),
        (slopes.std(), 'w1 std', dict(visible='legendonly')),
        (pop_lin_reg.intercept_, 'wo pop', dict(marker=dict(color='#4F5FF7'))),
        (pop_lin_reg.coef_[0], 'w1 pop', dict(marker=dict(color='#ED3012'))),
    )
    for position, legend_name, extra_options in reference_lines:
        fig.add_trace(plotly.graph_objects.Scatter(
            x=[position, position],
            y=[0, 80],
            name=legend_name,
            mode='lines',
            **extra_options,
        ))

    fig.update_traces(opacity=.75)
    fig.update_layout(title='n=' + column_prefix, barmode='overlay')
    fig.show()
# Overlay the test-MSE distributions for n=50 and n=500, mark their means and
# add the MSE stored in pop_mse as a further vertical line.
fig = plotly.graph_objects.Figure()
for mse_column, legend_name in (('50_mse', 'mse n=50'),
                                ('500_mse', 'mse n=500')):
    fig.add_trace(plotly.graph_objects.Histogram(x=erg[mse_column],
                                                 name=legend_name))

for mse_column, legend_name, line_color in (
        ('50_mse', 'mse mean n=50', '#4F5FF7'),
        ('500_mse', 'mse mean n=500', '#ED3012')):
    mean_mse = erg[mse_column].mean()
    fig.add_trace(plotly.graph_objects.Scatter(
        x=[mean_mse, mean_mse],
        y=[0, 110],
        name=legend_name,
        mode='lines',
        marker=dict(color=line_color),
    ))

fig.add_trace(plotly.graph_objects.Scatter(
    x=[pop_mse, pop_mse],
    y=[0, 120],
    name='mse pop',
    mode='lines',
))
fig.update_traces(opacity=.75)
fig.update_layout(title='MSE', barmode='overlay')
fig.show()